qPCR data QC

Author

Laura Symul & Laura Vermeren

Published

June 14, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)

tmp <- fs::dir_map("R scripts/", source)
tmp <- fs::dir_map("../VIBRANT-99-utils/R/", source)
rm(tmp)

theme_set(theme_light())

Loading the data

Code
data_source <- "real"
qpcr_dir <- str_c(get_data_dir(data_source), "00 Raw/03 qPCR/")
file <- "VIBRANT_qPCR_data_merged_250611.csv"
cat("We load the file", file, " (15 LBP strains)\n\n")
We load the file VIBRANT_qPCR_data_merged_250611.csv  (15 LBP strains)
Code
qpcr_LBP <- read_csv(str_c(qpcr_dir, file))

file <- "VIBRANT_16SqPCR_20250514.csv"
cat("\n\nAnd the file ", file, " (16S)\n\n")


And the file  VIBRANT_16SqPCR_20250514.csv  (16S)
Code
qpcr_16S <- read_csv(str_c(qpcr_dir, file))

rm(qpcr_dir, file)

The data is provided in long format:

Code
qpcr_LBP |> glimpse()
Rows: 54,810
Columns: 21
$ Data_row                 <dbl> 2275, 2276, 2277, 1603, 1604, 1605, 2272, 227…
$ Well                     <chr> "E14", "E15", "E16", "E14", "E15", "E16", "E1…
$ Fluor                    <chr> "HEX", "HEX", "HEX", "Cy5", "Cy5", "Cy5", "FA…
$ Content                  <chr> "Unkn-23", "Unkn-23", "Unkn-23", "Unkn-23", "…
$ Replicate                <dbl> 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2…
$ Sample                   <chr> "C5", "C5", "C5", "C5", "C5", "C5", "C5", "C5…
$ Cq                       <dbl> 30.91797, 30.97083, 30.74273, NA, NA, NA, NA,…
$ `Starting Quantity (SQ)` <dbl> 97.7864496, 94.2051474, 110.6643855, NA, NA, …
$ `Cq Mean`                <dbl> 30.87718, 30.87718, 30.87718, 0.00000, 0.0000…
$ plate_id                 <chr> "B063809", "B063809", "B063809", "B063810", "…
$ Plate_number             <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1…
$ gDNA_Plate_ID            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VMRC_Group               <chr> "VMRC_4", "VMRC_4", "VMRC_4", "VMRC_3", "VMRC…
$ Group_Fluor              <chr> "VMRC_4_HEX", "VMRC_4_HEX", "VMRC_4_HEX", "VM…
$ Target                   <chr> "122010", "122010", "122010", "185329", "1853…
$ gDNA_Plate_Well          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VIBR_Sample_ID           <chr> "551", "551", "551", "551", "551", "551", "55…
$ InRedos                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ Dilution_Factor          <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
$ Quant_Adjusted           <dbl> 488.932248, 471.025737, 553.321928, 0.000000,…
$ Copies_per_swab          <dbl> 122233.0620, 117756.4343, 138330.4819, 0.0000…

Merging 16S and LBP qPCR data

Since the two files have almost exactly the same columns, we merge them into a single file after some harmonization.

First, we notice that there are some clinical samples that have data for the LBP qPCR, but not for the 16S qPCR.

Code
dplyr::full_join(
  qpcr_16S |> select(VIBR_Sample_ID, gDNA_Plate_ID) |> mutate(has_16S = TRUE) |> distinct(),
  qpcr_LBP |> select(VIBR_Sample_ID, gDNA_Plate_ID) |> mutate(has_LBP = TRUE) |> distinct()
) |>  dplyr::filter(is.na(has_16S) | is.na(has_LBP)) |> 
  dplyr::filter(!is.na(VIBR_Sample_ID), !(VIBR_Sample_ID %in% c("#N/A", "empty"))) |> 
  mutate(index = row_number()) |> 
  select(index, everything()) |> 
  gt(
    caption = "Clinical samples with data for LBP qPCR but not for 16S qPCR, or vice versa."
  )
Clinical samples with data for LBP qPCR but not for 16S qPCR, or vice versa.
index VIBR_Sample_ID gDNA_Plate_ID has_16S has_LBP
1 551 NA NA TRUE
2 972 NA NA TRUE
3 1167 NA NA TRUE
4 1291 NA NA TRUE
5 1588 NA NA TRUE
6 2646 NA NA TRUE
7 3344 NA NA TRUE
8 3458 NA NA TRUE
9 3842 NA NA TRUE
10 3968 NA NA TRUE
11 4112 NA NA TRUE
12 2152252 NA NA TRUE
13 2179903 NA NA TRUE
14 2181144 NA NA TRUE
15 2182677 NA NA TRUE
16 2183297 NA NA TRUE
17 2187012 NA NA TRUE
18 2192710 NA NA TRUE
19 2193899 NA NA TRUE
20 2196625 NA NA TRUE
21 2196631 NA NA TRUE
22 2229330 NA NA TRUE
23 2250465 NA NA TRUE
24 2251190 NA NA TRUE
25 2265171 NA NA TRUE
26 2267328 NA NA TRUE
27 2268171 NA NA TRUE
28 2268611 NA NA TRUE
29 2274226 NA NA TRUE
30 2274241 NA NA TRUE
31 2275000 NA NA TRUE
32 2275799 NA NA TRUE
33 2276001 NA NA TRUE
34 2277248 NA NA TRUE
35 2277845 NA NA TRUE
36 2281503 NA NA TRUE
37 2282879 NA NA TRUE
38 2284143 NA NA TRUE
39 2284198 NA NA TRUE
40 2284891 NA NA TRUE
41 2284937 NA NA TRUE
42 2285110 NA NA TRUE
43 2285838 NA NA TRUE
44 2287336 NA NA TRUE
45 2287920 NA NA TRUE
46 2289558 NA NA TRUE
47 2290513 NA NA TRUE
48 2290535 NA NA TRUE
49 2290557 NA NA TRUE
50 2290679 NA NA TRUE
51 2290691 NA NA TRUE
52 2290702 NA NA TRUE
53 2291245 NA NA TRUE
54 2291251 NA NA TRUE
55 2291257 NA NA TRUE
56 2291270 NA NA TRUE
57 2291276 NA NA TRUE
58 2291282 NA NA TRUE
59 2297239 NA NA TRUE
60 2298406 NA NA TRUE
61 2298988 NA NA TRUE
62 2300204 NA NA TRUE
63 2304216 NA NA TRUE
64 2304663 NA NA TRUE
65 2305944 NA NA TRUE
66 2309505 NA NA TRUE
67 2171048 2240890 NA TRUE

We also note that there is one sample on the same plate that has 6 replicates instead of 3 in the 16S qPCR.

Code
qpcr_16S |> dplyr::filter(!is.na(VIBR_Sample_ID)) |> dplyr::count(VIBR_Sample_ID, gDNA_Plate_ID ) |> dplyr::filter(n > 3)
# A tibble: 1 × 3
  VIBR_Sample_ID gDNA_Plate_ID     n
  <chr>                  <dbl> <int>
1 2167612              2240890     6
Code
qpcr_16S |> dplyr::filter(VIBR_Sample_ID == "2167612") |> select(Data_row, Well, Sample, Cq, Copies_per_swab)
# A tibble: 6 × 5
  Data_row Well  Sample    Cq Copies_per_swab
     <dbl> <chr> <chr>  <dbl>           <dbl>
1      163 I08   E4      22.4      218362435.
2      178 J07   E4      22.4      213446847.
3      179 J08   E4      22.4      222995668.
4      242 M10   G5      22.6      181004723.
5      258 N09   G5      22.5      195241505.
6      259 N10   G5      22.8      157625548.

As discussed with Michael, this is a mislabeling and these 6 are 3 replicates for 2 different samples. Since we have identified this issue already, we will fix it later in the code, using the extraction library plate number and position.

Code
# We rename the standards in the 16S qPCR so they are numbered
std_16s <-
  qpcr_16S |> 
  dplyr::filter(str_detect(Content, "Std")) |> 
  select(Content, `Starting Quantity (SQ)`) |> 
  distinct() |> 
  arrange(`Starting Quantity (SQ)` |> desc()) |>
  mutate(std_nb = row_number())
  
qpcr_16S <- 
  qpcr_16S |> 
  dplyr::left_join(std_16s, by = join_by(Content, `Starting Quantity (SQ)`)) |> 
  mutate(
    Content = Content |> str_c("-", std_nb |> str_pad(width = 2, pad = "0")),
    Target = "16S"
        ) |> 
  select(-std_nb)

# We transform the VIBR_Sample_ID column to be consistent with the 16S qPCR
qpcr_LBP <- 
  qpcr_LBP |> 
  mutate(
    VIBR_Sample_ID = ifelse(VIBR_Sample_ID == "#N/A", NA_character_, VIBR_Sample_ID)
  )

# we merge the two datasets
qpcr <- 
  bind_rows(
    qpcr_LBP |> mutate(target_type = "LBP"), 
    qpcr_16S |> mutate(target_type = "16S rRNA")
    ) |> 
  relocate(target_type, .after = Target)
Code
dictionary <- 
  tibble(original_name = colnames(qpcr)) |> 
  mutate(
    description = 
      case_when(
        (original_name == "Data_row") ~ 'Table row number',
        (original_name == "Well") ~ str_c("qPCR plate well ID. From ", min(qpcr$Well), " to ", max(qpcr$Well)),
        (original_name == "Fluor") ~ str_c('fluorescent dye used (one of ', qpcr$Fluor |> unique() |> str_c(collapse = ", "), ")"),
        (original_name == "Content") ~ 'Indicates whether well contains a sample, a standard, or a negative control.',
        (original_name == "Replicate") ~ 
          str_c(
            "The replicate number from ", min(qpcr$Replicate, na.rm = TRUE), " to ", max(qpcr$Replicate, na.rm = TRUE), 
            " (with ", sum(is.na(qpcr$Replicate)), " missing values). Values are repeated 495, or 990 times."
          ),
        (original_name == "Sample") ~ 'A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.',
        (original_name == "Cq") ~ 'Quantification cycle (Cq) value.',
        (original_name == "Starting Quantity (SQ)") ~ 'Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).',
        (original_name == "Cq Mean") ~ 'Mean across replicates',
        (original_name == "plate_id") ~ str_c('qPCR plate ID; there are ', qpcr$plate_id |> unique() |> length()," unique plate IDs in total."),
        (original_name == "Plate_number") ~ str_c("The extraction plate number; there are ", qpcr$Plate_number |> unique() |> length()," unique plate numbers in total (5 identical `plate_id` per `Plate_number`)."),
        (original_name == "gDNA_Plate_ID") ~ 'The extraction plate ID',
        (original_name == "VMRC_Group") ~ str_c('The group of the LBP strains. There are ', qpcr$VMRC_Group |> unique() |> na.omit() |> length()," unique VMRC groups in total (", qpcr$VMRC_Group |> unique() |> na.omit() |> sort() |> str_c(collapse = ", "), ")"),
        (original_name == "Group_Fluor") ~ 'Concatenation of the `VMRC_Group` and `Fluor` columns',
        (original_name == "Target") ~ str_c('The target "gene" (here taxa/strain). There are ', qpcr$Target |> unique() |> length(), " unique targets in total (", qpcr$Target |> unique() |> str_c(collapse = ", "),")"),
        (original_name == "target_type") ~ "Whether the target is a LBP strain or the 16S rRNA gene target",
        (original_name == "gDNA_Plate_Well") ~ str_c('The concatenation of the gDNA plate ID and the plate well; there are ', qpcr$gDNA_Plate_Well |> unique() |> length()," unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., ", qpcr$gDNA_Plate_Well |> unique() |> extract(10),")"),
        (original_name == "VIBR_Sample_ID") ~ str_c("VIBRANT sample ID; there are ", qpcr$VIBR_Sample_ID |> unique() |> length(), " unique VIBRANT sample IDs in total)"),
        (original_name == "Dilution_Factor") ~ str_c('The qPCR plate specific dilution factor. There are ', qpcr$Dilution_Factor |> unique() |> length()," unique dilution factors in total (", qpcr$Dilution_Factor |> unique()  |> sort() |> str_c(collapse = ", "),")"),
        (original_name == "Quant_Adjusted") ~ str_c("Quantification value adjusted by the dilution factor; range from ", min(qpcr$Quant_Adjusted, na.rm = TRUE)," to ", max(qpcr$Quant_Adjusted, na.rm = TRUE)),
        (original_name == "Copies_per_swab") ~ str_c("The number of target copies per swab, computed from the adjusted quantification values. Ranges from ", min(qpcr$Copies_per_swab, na.rm = TRUE)," to ", max(qpcr$Copies_per_swab, na.rm = TRUE)),
         (original_name == "InRedos") ~ str_c("Whether the sample was part of the samples that were redone (= re-extracted)."),
        TRUE ~ "????"
      )
  )
Code
# dictionary |> gt()

Tidy names

We tidy the column names for consistency and readability.

Code
qpcr <- qpcr |> janitor::clean_names()
qpcr <- qpcr |> 
  dplyr::rename(
    strain_group_qpcr = vmrc_group,
    starting_quantity = starting_quantity_sq,
    pcr_plate_id = plate_id,
    ext_lib_plate_nb = plate_number,
    ext_lib_plate_id = g_dna_plate_id,
    ext_lib_position = g_dna_plate_well
  )
Code
dictionary <- 
  dictionary |> 
  mutate(name = colnames(qpcr)) |> 
  select(name, everything())

The new column names are:

Code
dictionary |> 
  gt() |> 
  cols_label(
    name = "New column name",
    original_name = "Original column name",
    description = "Description"
  ) 
New column name Original column name Description
data_row Data_row Table row number
well Well qPCR plate well ID. From A01 to P24
fluor Fluor fluorescent dye used (one of HEX, Cy5, FAM, SYBR)
content Content Indicates whether well contains a sample, a standard, or a negative control.
replicate Replicate The replicate number from 1 to 96 (with 3972 missing values). Values are repeated 495, or 990 times.
sample Sample A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.
cq Cq Quantification cycle (Cq) value.
starting_quantity Starting Quantity (SQ) Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).
cq_mean Cq Mean Mean across replicates
pcr_plate_id plate_id qPCR plate ID; there are 71 unique plate IDs in total.
ext_lib_plate_nb Plate_number The extraction plate number; there are 12 unique plate numbers in total (5 identical `plate_id` per `Plate_number`).
ext_lib_plate_id gDNA_Plate_ID The extraction plate ID
strain_group_qpcr VMRC_Group The group of the LBP strains. There are 5 unique VMRC groups in total (VMRC_1, VMRC_2, VMRC_3, VMRC_4, VMRC_5)
group_fluor Group_Fluor Concatenation of the `VMRC_Group` and `Fluor` columns
target Target The target "gene" (here taxa/strain). There are 16 unique targets in total (122010, 185329, C0006A1, C0022A1, C0028A1, C0059E1, C0112A1, C0175A1, FF00004, FF00018, FF00051, FF00064, FF00072, UC101, UC119, 16S)
target_type target_type Whether the target is a LBP strain or the 16S rRNA gene target
ext_lib_position gDNA_Plate_Well The concatenation of the gDNA plate ID and the plate well; there are 1155 unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., 2240900_G2)
vibr_sample_id VIBR_Sample_ID VIBRANT sample ID; there are 1031 unique VIBRANT sample IDs in total)
in_redos InRedos Whether the sample was part of the samples that were redone (= re-extracted).
dilution_factor Dilution_Factor The qPCR plate specific dilution factor. There are 5 unique dilution factors in total (5, 10, 20, 30)
quant_adjusted Quant_Adjusted Quantification value adjusted by the dilution factor; range from 0 to 3e+08
copies_per_swab Copies_per_swab The number of target copies per swab, computed from the adjusted quantification values. Ranges from 0 to 7.5e+10
Code
qpcr <- 
  qpcr |> 
  mutate(
    in_redos = (in_redos == 1) |> replace_na(FALSE) # Convert to logical
  )

Sample types and unique sample IDs

We define a qpcr_sample_type variable to summarize information from vibr_sample_id and sample columns.

Code
qpcr <-  
  qpcr |> 
  mutate(
    qpcr_sample_type = 
      case_when(
        is.na(vibr_sample_id) & str_detect(content, "Std") ~ "Standard",
        is.na(vibr_sample_id) & str_detect(sample, "water") ~ "Water",
        str_detect(vibr_sample_id, "EQ") ~ "Control sample",
        str_detect(vibr_sample_id, "empty") ~ "Empty",
        is.na(vibr_sample_id) ~ "Water or empty",
        TRUE ~ "VIBRANT clinical sample"
      )
  )
Code
# qpcr |> dplyr::filter(target == "16S") |> arrange(qpcr_sample_type) |> View()
Code
qpcr |> dplyr::count(qpcr_sample_type, name = "n_wells") |> arrange(-n_wells) |> gt()
qpcr_sample_type n_wells
VIBRANT clinical sample 49530
Standard 4044
Control sample 2112
Empty 1890
Water 540
Water or empty 126
Code
tmp <- 
  qpcr |> 
  dplyr::filter(qpcr_sample_type == "VIBRANT clinical sample", target == target[1]) |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = ext_lib_plate_nb |> unique() |> length(),
    any_in_redos = any(in_redos)
  )

# tmp |> dplyr::count(n_plates, any_in_redos) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates, n_plates, any_in_redos) # All samples have 3 replicates

We note that almost all samples’s replicates are on a single extraction plate (ext_lib_plate_nb or ext_lib_plate_id); and almost all samples have the same number of replicates (3). A few samples (those in_redos) are on two plates and have 6 replicates.

Code
tmp <- 
  qpcr |> 
  dplyr::filter(qpcr_sample_type == "VIBRANT clinical sample", target == "16S") |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = ext_lib_plate_nb |> unique() |> length()
  )

# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates) # All samples have 3 replicates
Code
tmp |> 
  dplyr::filter(n_replicates == 6) |> 
  dplyr::left_join(qpcr |> dplyr::filter(target == "16S"), by = join_by(vibr_sample_id)) |> 
  select(vibr_sample_id, ext_lib_plate_nb, ext_lib_position, pcr_plate_id, data_row, well, cq) |> 
  gt()
vibr_sample_id ext_lib_plate_nb ext_lib_position pcr_plate_id data_row well cq
2167612 1 2240890_E4 b062862 163 I08 22.40223
2167612 1 2240890_E4 b062862 178 J07 22.42773
2167612 1 2240890_E4 b062862 179 J08 22.37871
2167612 1 2240890_G5 b062862 242 M10 22.61241
2167612 1 2240890_G5 b062862 258 N09 22.52760
2167612 1 2240890_G5 b062862 259 N10 22.76733

We’ll fix that below.

Code
qpcr <- 
  qpcr |> 
  mutate(
    well_col = well |> str_sub(1, 1), 
    well_row = well |> str_sub(2,3) |> as.integer()
    ) |> 
  relocate(well_col, well_row, .after = well)

We also define a unique “qPCR sample ID” according to each sample type.

Code
qpcr <- 
  qpcr |> 
  mutate(
    qpcr_sample_id = 
      case_when(
        (qpcr_sample_type == "VIBRANT clinical sample") ~ vibr_sample_id,
        (qpcr_sample_type == "Control sample") ~ vibr_sample_id,
        (qpcr_sample_type == "Empty") ~ 
          str_c(vibr_sample_id,"_plate_", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
        (qpcr_sample_type == "Water") ~ 
          str_c("water_plate", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
        (qpcr_sample_type == "Water or empty") ~ 
          str_c("water_or_empty_plate", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
        (qpcr_sample_type == "Standard") ~ 
          str_c("std_", content |> str_remove("Std-"), 
                "_plate_", ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
        TRUE ~ NA_character_
      )
  ) |> 
  group_by(qpcr_sample_id, target) |>
  arrange(ext_lib_plate_nb, well_col, well_row) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(qpcr_uid = str_c(qpcr_sample_id, "_r", replicate_nb))
Code
# qpcr |> dplyr::filter(replicate_nb > 3) |> View()

For the LBP strains, the plate layout is as follows for the first 11 extraction plates:

Code
qpcr_sample_type_colors <- c(
  "VIBRANT clinical sample" = "dodgerblue2",
  "Control sample" = "turquoise",
  "Standard" = "gold",
  "Water" = "lightblue1",
  "Empty" = "gray",
  "Water or empty" = "gray80"
)

qpcr |> 
  dplyr::filter(target != "16S", ext_lib_plate_nb <= 11) |> 
  select(ext_lib_plate_nb, well_col, well_row, qpcr_sample_type, sample) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type) +
  facet_wrap(~ ext_lib_plate_nb) +
  geom_tile() +
  geom_path(aes(group = sample), col = "black", alpha = 0.5) +
  geom_point() +
  scale_fill_manual(values = qpcr_sample_type_colors) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

For the 12th plate, the layout is a little bit different and the first PCR plate (for the first group of 3 target) is different from the 4 other:

Code
qpcr |> 
  dplyr::filter(target != "16S", ext_lib_plate_nb == 12) |> 
  select(ext_lib_plate_nb, pcr_plate_id, well_col, well_row, qpcr_sample_type, sample, strain_group_qpcr) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type) +
  facet_wrap(~ pcr_plate_id + strain_group_qpcr) +
  geom_tile() +
  geom_path(aes(group = sample), col = "black", alpha = 0.5) +
  geom_point() +
  scale_fill_manual(values = qpcr_sample_type_colors) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

For the 16S plates, the layout is a little different:

Code
qpcr |> 
  dplyr::filter(target == "16S") |> 
  select(ext_lib_plate_nb, pcr_plate_id, well_col, well_row, qpcr_sample_type, qpcr_sample_id) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type, col = qpcr_sample_type) +
  facet_wrap(~ ext_lib_plate_nb + pcr_plate_id) +
  geom_tile(alpha = 0.25) +
  geom_path(aes(group = qpcr_sample_id), alpha = 0.5, linewidth = 2) +
  geom_point() +
  scale_fill_manual(values = qpcr_sample_type_colors) +
  scale_color_manual(values = qpcr_sample_type_colors) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Lines connect replicates") 

Manifest data

We merge the qPCR data with the metagenomics manifest data to add the participant ID (pid) and (visit_code).

But before we do that, we fix the qpcr_sample_id for that sample that had 6 replicates instead of 3 in the 16S qPCR because, after discussion with Michael, we know that one of these barcode was wrong.

Code
six_replicates <- 
  qpcr |> 
  dplyr::filter(target == "16S") |> 
  group_by(qpcr_sample_id) |> 
  mutate(n_replicates = n()) |> 
  ungroup() |> 
  dplyr::filter(n_replicates > 3) |> 
  select(target, vibr_sample_id, qpcr_sample_id, qpcr_uid, ext_lib_position, ext_lib_plate_nb, ext_lib_plate_id)

six_replicates |> gt()
target vibr_sample_id qpcr_sample_id qpcr_uid ext_lib_position ext_lib_plate_nb ext_lib_plate_id
16S 2167612 2167612 2167612_r1 2240890_E4 1 2240890
16S 2167612 2167612 2167612_r2 2240890_E4 1 2240890
16S 2167612 2167612 2167612_r3 2240890_E4 1 2240890
16S 2167612 2167612 2167612_r4 2240890_G5 1 2240890
16S 2167612 2167612 2167612_r5 2240890_G5 1 2240890
16S 2167612 2167612 2167612_r6 2240890_G5 1 2240890

We see that these 6 replicates have 2 different positions on the extraction library plate so we will be able to fix their sample ID from using extraction plate data from the technical metadata manifest using the concatenation of ext_lib_plate_id and ext_lib_position.

Code
data_dir <- get_data_dir(data_source)
mg_dir <- str_c(data_dir, "00 Raw/02 Metagenomics/")
technical_metadata <- 
  readRDS(
    str_c(
      get_output_dir(data_source = data_source),  
      "02_technical_metadata_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )

# SE_mg <- 
#   readRDS(
#     list.files(
#       path = get_output_dir(data_source = data_source), 
#       pattern = "02_se_mg_.*\\.rds$", full.names = TRUE
#     ) |> 
#       sort(decreasing = TRUE) |>
#       extract(1) 
#   )
Code
six_replicates |> 
  mutate(ext_lib_well = ext_lib_position |> str_remove("[0-9]*_")) |> 
  dplyr::left_join(
    technical_metadata |> 
      select(swab_barcode, ext_lib_plate_nb, ext_lib_plate_id, ext_lib_position) |>
      dplyr::rename(
        ext_lib_plate_id_mg_manifest = ext_lib_plate_id,
        ext_lib_well = ext_lib_position
      )
  ) |> gt()
target vibr_sample_id qpcr_sample_id qpcr_uid ext_lib_position ext_lib_plate_nb ext_lib_plate_id ext_lib_well swab_barcode ext_lib_plate_id_mg_manifest
16S 2167612 2167612 2167612_r1 2240890_E4 1 2240890 E4 2171048 2316297
16S 2167612 2167612 2167612_r2 2240890_E4 1 2240890 E4 2171048 2316297
16S 2167612 2167612 2167612_r3 2240890_E4 1 2240890 E4 2171048 2316297
16S 2167612 2167612 2167612_r4 2240890_G5 1 2240890 G5 2167612 2316297
16S 2167612 2167612 2167612_r5 2240890_G5 1 2240890 G5 2167612 2316297
16S 2167612 2167612 2167612_r6 2240890_G5 1 2240890 G5 2167612 2316297

One thing that is weird is that the ext_lib_plate_id in the technical metadata manifest is not the same as the one in the qPCR data:

Code
qpcr |> 
  dplyr::count(ext_lib_plate_nb, ext_lib_plate_id) |> 
  gt(caption = "Extraction library plate nb and IDs as found in the qPCR data.")
Extraction library plate nb and IDs as found in the qPCR data.
ext_lib_plate_nb ext_lib_plate_id n
1 2240890 4992
2 2240900 4992
3 2316397 4992
4 2316390 4992
5 2316420 4992
6 2316419 4992
7 2316095 4992
8 2316093 4992
9 2315117 4992
10 2316391 4992
11 2329564 4992
12 NA 3330
Code
technical_metadata |> 
  dplyr::count(ext_lib_plate_nb, ext_lib_plate_id) |> 
  gt(caption = "Extraction library plate nb and IDs as found in the metagenomics technical metadata.")
Extraction library plate nb and IDs as found in the metagenomics technical metadata.
ext_lib_plate_nb ext_lib_plate_id n
1 2316297 89
2 2240891 88
3 2316307 86
4 2316306 88
5 2316310 87
6 2316311 89
7 2316094 88
8 2316097 88
9 2316392 79
10 2316393 82
11 2329564 85
12 2240891 1
12 2316094 2
12 2316097 2
12 2316297 2
12 2316306 1
12 2316307 2
12 2316310 2
12 2316311 1
12 2316392 2
12 2316393 2
NA NA 63
Warning

@Michael: What are your thoughts on this? Am I assuming they should be the same when there is no reason for them to be the same?

Regardless, since the ext_lib_plate_nb seems to match, we’ll use this to fix that 6-replicates sample. From the table above, it looks like the qpcr_sample_id for the 16S qPCR located in well E4 should be 2171048 instead of 2167612.

We fix their vibr_sample_id, qpcr_sample_id, qpcr_uid, and replicate_nb columns.

Code
qpcr <- 
  qpcr |> 
  mutate(
    needs_change = 
      (qpcr_sample_id == "2167612" & 
         target == "16S" & 
         ext_lib_plate_nb == 1 & 
         ext_lib_position == "2240890_E4"),
    needs_replicates_change = 
      (qpcr_sample_id == "2167612" & 
         target == "16S" & 
         ext_lib_plate_nb == 1 & 
         ext_lib_position == "2240890_G5"),
    qpcr_sample_id = case_when(
      needs_change ~ "2171048",
      TRUE ~ qpcr_sample_id
    ),
    vibr_sample_id = case_when(
      needs_change ~ "2171048",
      TRUE ~ vibr_sample_id
    ),
    qpcr_uid = case_when(
      needs_change ~ qpcr_uid |> str_replace("2167612", "2171048"),
      needs_replicates_change ~ qpcr_uid |> str_replace("r4", "r1") |> str_replace("r5", "r2") |> str_replace("r6", "r3"),
      TRUE ~ qpcr_uid
    ),
    replicate_nb = 
      case_when(
        needs_replicates_change ~ replicate_nb - 3L,
        TRUE ~ replicate_nb
      )
  ) |> 
  select(-needs_change, needs_replicates_change)

Now that this is fixed, we can merge the technical metadata with the qPCR data to be able to build their uid (VIBRANT cross-assay unique identifier) and identify their sample_type, control_type, etc.

Code
# 
# qpcr <- 
#   qpcr |> 
#   dplyr::left_join(
#     SE_mg |>
#       colData() |> as.data.frame() |> as_tibble() |> 
#       select(
#         swab_barcode,
#         uid, mg_uid, 
#         mg_pid, mg_visit_code,
#         sample_type, control_type
#       ) |> 
#       distinct() ,
#     by = join_by("qpcr_sample_id" == "swab_barcode")
#   )

qpcr <- 
  qpcr |> 
  dplyr::left_join(
    technical_metadata |> 
      select(
        swab_barcode,
        uid, mg_uid, 
        mg_pid, mg_visit_code, 
        sample_type, control_type
      ),
    by = join_by("qpcr_sample_id" == "swab_barcode")
  )
Code
qpcr |>
  dplyr::filter(is.na(mg_uid)) |> 
  dplyr::count(content, vibr_sample_id) |> 
  View()
Code
samples_without_entries_in_mg_manifest <- 
  qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) |>
  dplyr::filter(is.na(uid)) |> 
  select(qpcr_sample_id, qpcr_sample_type) |> 
  distinct()  |> 
  arrange(qpcr_sample_type, qpcr_sample_id) 

There are 1 clinical or vibrant positive/negative control samples without a matching entry in the metagenomics manifest.

Code
samples_without_entries_in_mg_manifest |> 
  # dplyr::count(qpcr_sample_type) |>
  gt()
qpcr_sample_id qpcr_sample_type
2320838 VIBRANT clinical sample

For these samples, the current uid is missing. We change it to the qpcr_sample_id when NA.

Code
qpcr <- 
  qpcr |> 
  mutate(
    uid = case_when(
      is.na(uid) & 
        (qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) ~ 
        qpcr_sample_id,
      TRUE ~ uid
    )
  )
Code
# mg_samples <- 
#   SE_mg |>
#   colData() |> as.data.frame() |> as_tibble() |> 
#   select(
#     swab_barcode,
#     uid, mg_uid, 
#     mg_pid,mg_visit_code, 
#     sample_type, control_type
#   ) |> 
#   distinct() |> 
#   dplyr::left_join(
#     qpcr |> 
#       select(qpcr_sample_id, qpcr_sample_type) |> 
#       distinct() |> 
#       dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")),
#     by = join_by("swab_barcode" == "qpcr_sample_id")
#   )

mg_samples <- 
  technical_metadata |> 
  select(
    swab_barcode,
    uid, mg_uid, 
    mg_pid,mg_visit_code, 
    sample_type, control_type,
    ext_lib_plate_nb, no_mg_data
  ) |> 
  distinct() |> 
  dplyr::left_join(
    qpcr |> 
      select(qpcr_sample_id, qpcr_sample_type) |> 
      distinct() |> 
      dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")),
    by = join_by("swab_barcode" == "qpcr_sample_id")
  ) |> 
  arrange(uid)

# mg_samples |> dplyr::filter(is.na(qpcr_sample_type)) |> nrow()

There are 1 samples for which we have metagenomics data but no qPCR data:

Code
mg_samples |> dplyr::filter(is.na(qpcr_sample_type)) |> gt()
swab_barcode uid mg_uid mg_pid mg_visit_code sample_type control_type ext_lib_plate_nb no_mg_data qpcr_sample_type
MC1 MC1 PC_MC1 Mock1 NA Positive control Mock 1 NA FALSE NA
Code
mg_samples <- 
  technical_metadata |> 
  select(
    swab_barcode,
    uid, mg_uid, 
    mg_pid,mg_visit_code, 
    sample_type, control_type,
    ext_lib_plate_nb, no_mg_data
  ) |> 
  distinct() |> 
  dplyr::left_join(
    qpcr |>
      select(qpcr_sample_id, qpcr_sample_type, target) |> 
      distinct() |> 
      dplyr::filter(
        target == "16S",
        qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")
        ),
    by = join_by("swab_barcode" == "qpcr_sample_id")
  ) |> 
  arrange(uid)

And there are 16 samples for which we have metagenomics data but no 16S qPCR data:

Code
mg_samples |> dplyr::filter(is.na(qpcr_sample_type)) |> gt()
swab_barcode uid mg_uid mg_pid mg_visit_code sample_type control_type ext_lib_plate_nb no_mg_data qpcr_sample_type target
2196631 068200127_1100 MG_2196631 68200127 1100 Clinical sample NA FALSE NA NA
2291257 068200278_1400 MG_2291257 68200278 1400 Clinical sample NA FALSE NA NA
2282879 068200281_1500 MG_2282879 68200281 1500 Clinical sample NA FALSE NA NA
2290557 068200357_1500 MG_2290557 68200357 1500 Clinical sample NA FALSE NA NA
2290513 068200363_1500 MG_2290513 68200363 1500 Clinical sample NA FALSE NA NA
2290535 068200366_1500 MG_2290535 68200366 1500 Clinical sample NA FALSE NA NA
2290691 068200370_1300 MG_2290691 68200370 1300 Clinical sample NA FALSE NA NA
2290679 068200373_1300 MG_2290679 68200373 1300 Clinical sample NA FALSE NA NA
2290702 068200375_1300 MG_2290702 68200375 1300 Clinical sample NA FALSE NA NA
2291270 068200388_1300 MG_2291270 68200388 1300 Clinical sample NA FALSE NA NA
2291282 068200398_1400 MG_2291282 68200398 1400 Clinical sample NA FALSE NA NA
2291245 068200407_1200 MG_2291245 68200407 1200 Clinical sample NA FALSE NA NA
2291251 068200409_1300 MG_2291251 68200409 1300 Clinical sample NA FALSE NA NA
2298406 068200414_1500 MG_2298406 68200414 1500 Clinical sample NA FALSE NA NA
2291276 068200415_1300 MG_2291276 68200415 1300 Clinical sample NA FALSE NA NA
MC1 MC1 PC_MC1 Mock1 NA Positive control Mock 1 NA FALSE NA NA
Warning

@Michael: I think these are the “new” samples.

Code
rm(mg_samples, samples_without_entries_in_mg_manifest, tmp)
Code
qpcr |> 
  dplyr::count(qpcr_sample_type, sample_type, control_type)
Code
qpcr <- 
  qpcr |> 
  mutate(
    sample_type = 
      case_when(
        is.na(sample_type) & (qpcr_sample_type == "VIBRANT clinical sample") ~ "Clinical sample",
        is.na(sample_type) & (qpcr_sample_type %in% c("Empty", "Water", "Water or empty")) ~ "Technical negative control",
        is.na(sample_type) & (qpcr_sample_type == "Standard") ~ "Standard",
        TRUE ~ sample_type
      ),
    control_type =
      case_when(
        is.na(control_type) & (qpcr_sample_type != "VIBRANT clinical sample") ~ qpcr_sample_type,
        is.na(control_type) & (qpcr_sample_type == "VIBRANT clinical sample") ~ "",
        TRUE ~ control_type
      )
  )

Target’s plate layout and fluorescent probes

The plate x Fluorescence x Target layout is as follows:

Code
qpcr |> 
  dplyr::count(fluor, target, strain_group_qpcr, pcr_plate_id, ext_lib_plate_nb) |> 
  arrange(ext_lib_plate_nb, strain_group_qpcr, pcr_plate_id, fluor) |> 
  mutate(
    target = target |> fct_inorder(), 
    plate_nb = str_c("ext. plate: ", ext_lib_plate_nb) |> fct_inorder(),
    pcr_plate_id = pcr_plate_id |> fct_inorder()
    ) |> 
  ggplot() +
  aes(
    x = pcr_plate_id,
    y = target |> factor() |> fct_rev(),
    fill = fluor
  ) +
  geom_tile() +
  facet_grid(strain_group_qpcr ~ plate_nb, scales = "free", space = "free") +
  xlab("qPCR plate ID") +
  ylab("Target") +
  scale_fill_manual(
    name = "Fluor.",
    values = c("FAM" = "dodgerblue1", "HEX" = "green3", "Cy5" = "#F8766D", "SYBR" = "green")
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

Potential batch effects due to extraction should be checked for ext_lib_plate_id and those due to the qPCR itself using the pcr_plate_id.

LBP strain information

We also add the LBP strain information to the qPCR data.

Code
SE_mg <-
  readRDS(
    list.files(
      path = get_output_dir(data_source = data_source),
      pattern = "02_se_mg_.*\\.rds$", full.names = TRUE
    ) |>
      sort(decreasing = TRUE) |>
      extract(1)
  )

qpcr <- 
  qpcr |> 
  dplyr::left_join(
    SE_mg |>
      rowData() |> as.data.frame() |> as_tibble() |> 
      dplyr::filter(!is.na(LBP)) |> 
      select(taxon_id, taxon_label, LBP, strain_id, strain_origin, biose_id) |> 
      dplyr::rename(target = taxon_id),
    by = join_by(target)
  )
Code
qpcr <- 
  qpcr |> 
  mutate(taxon_label = taxon_label |> str_replace_na("16S rRNA gene target"))
Code
rm(SE_mg)
Code
qpcr |> 
  arrange(strain_group_qpcr, strain_origin) |>
  mutate(target = target |> fct_inorder()) |> 
  ggplot() +
  aes(x = target, y = strain_group_qpcr, col = strain_group_qpcr) +
  geom_point() +
  facet_grid(. ~ LBP + strain_origin, scales = "free", space = "free")  +
  xlab("Target") + ylab("VMRC group") +
  guides(col = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) 

Relationships between columns

Observations and calibration curves

Observed values are the cq values.

starting_quantity are computed from the cq values using the calibration curves fitted on the standard values.

\[C = f(SQ^*)\]

where \(C\) is the observed cq value for the standard, \(SQ^*\) is the known starting quantity starting_quantity for the standards, and \(f\) is the calibration function.

Note

I assume that, either the standards are not diluted, and that the quant_adjusted and copies_per_swabs are not correct for the standards, or that the standards are diluted and that the calibration curves are fitted on the quant_adjusted:

Code
qpcr |>
  dplyr::filter(qpcr_sample_type == "Standard", target != "16S") |> 
  select(
    sample, starting_quantity, dilution_factor, quant_adjusted, copies_per_swab
  ) |> 
  distinct() |> 
  arrange(sample, dilution_factor) |> 
  gt()
sample starting_quantity dilution_factor quant_adjusted copies_per_swab
std 10^1 1e+01 5 5e+01 1.25e+04
std 10^1 1e+01 10 1e+02 2.50e+04
std 10^1 1e+01 20 2e+02 5.00e+04
std 10^1 1e+01 30 3e+02 7.50e+04
std 10^1 1e+01 NA NA NA
std 10^2 1e+02 5 5e+02 1.25e+05
std 10^2 1e+02 10 1e+03 2.50e+05
std 10^2 1e+02 20 2e+03 5.00e+05
std 10^2 1e+02 30 3e+03 7.50e+05
std 10^2 1e+02 NA NA NA
std 10^3 1e+03 5 5e+03 1.25e+06
std 10^3 1e+03 10 1e+04 2.50e+06
std 10^3 1e+03 20 2e+04 5.00e+06
std 10^3 1e+03 30 3e+04 7.50e+06
std 10^3 1e+03 NA NA NA
std 10^4 1e+04 5 5e+04 1.25e+07
std 10^4 1e+04 10 1e+05 2.50e+07
std 10^4 1e+04 20 2e+05 5.00e+07
std 10^4 1e+04 30 3e+05 7.50e+07
std 10^4 1e+04 NA NA NA
std 10^5 1e+05 5 5e+05 1.25e+08
std 10^5 1e+05 10 1e+06 2.50e+08
std 10^5 1e+05 20 2e+06 5.00e+08
std 10^5 1e+05 30 3e+06 7.50e+08
std 10^5 1e+05 NA NA NA
std 10^6 1e+06 5 5e+06 1.25e+09
std 10^6 1e+06 10 1e+07 2.50e+09
std 10^6 1e+06 20 2e+07 5.00e+09
std 10^6 1e+06 30 3e+07 7.50e+09
std 10^6 1e+06 NA NA NA
std 10^7 1e+07 5 5e+07 1.25e+10
std 10^7 1e+07 10 1e+08 2.50e+10
std 10^7 1e+07 20 2e+08 5.00e+10
std 10^7 1e+07 30 3e+08 7.50e+10
std 10^7 1e+07 NA NA NA
Code
qpcr |>
  dplyr::filter(qpcr_sample_type == "Standard") |> 
  arrange(-dilution_factor) |>
  ggplot() +
  aes(
    # x = starting_quantity |> log10(), 
    x = quant_adjusted |> log10(),
    y = cq, 
    col = dilution_factor |> factor()
  ) +
  geom_point(size = 1, alpha = 0.2) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("black", "dodgerblue1"))(4)
    ) +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) 

For all clinical samples, the starting_quantity is computed from the cq values using the calibration curve fitted on the standard values.

\[\hat{SQ} = f^{-1}(C)\]

where \(\hat{SQ}\) is the estimated starting quantity starting_quantity for the clinical samples, \(C\) is the observed cq value for the clinical sample, and \(f^{-1}\) is the inverse of the calibration function.

If \(C\) is not observed (i.e., assumed to be larger than the total number of cycles), then \(\hat{SQ}\) is estimated as 0.

Code
# qpcr |> 
#   ggplot() +
#   aes(x = starting_quantity |> log10(), y = cq, col = qpcr_sample_type) +
#   geom_point(size = 0.5, alpha = 0.4) +
#   facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb) 


qpcr |> 
  ggplot() +
  aes(
    x = starting_quantity |> log10(), 
    y = cq, 
    col = ext_lib_plate_nb |> factor(),
    size = (qpcr_sample_type == "Standard")
    ) +
  geom_point(alpha = 0.5) +
  facet_wrap(strain_group_qpcr + target ~ ., ncol = 6) +
  scale_size_manual("Standard", values = c(0.2, 1)) +
  scale_color_viridis_d("Extraction plate number", direction = -1, option = "A", end = 0.8) 

It looks like \(f\) is a linear function of \(\log(SQ^*)\): \(C = \beta_0 + \beta_1 \log(SQ^*)\). This seems appropriate for the LBP strains, but for the total 16S, the standard curves are not linear. Probably not too much of an issue and unlikely to create huge batch effects.

Adjusted quantities

Adjusted quantities (quant_adjusted) are then computed from the estimated starting_quantity values using the dilution factors.

\(\hat{Q} = \hat{SQ} \times d\)

where \(\hat{Q}\) is the estimated adjusted quantity (quant_adjusted), \(\hat{SQ}\) is the estimated starting quantity (starting_quantity) for the clinical samples, and \(d\) is the dilution factor.

Code
qpcr |>
  ggplot() +
  aes(
    x = starting_quantity |> log10(), 
    y = quant_adjusted |> log10(), 
    col = dilution_factor |> factor()
    ) +
  geom_point() +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
    )

quant_adjusted = starting_quantity x dilution_factor:

Code
qpcr |>
  ggplot() +
  aes(
    x = (starting_quantity * dilution_factor) |> log10(), 
    y = (quant_adjusted) |> log10(), 
    col = dilution_factor |> factor()
    ) +
  geom_point() +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
    )

Copies per swab

copies_per_swab are the adjusted_quant multiplied by a constant factor estimated to be the number of bacteria in 1 concentration unit.

Here, that number is 250 for all LBP targets.

Code
# strain_group_qpcr + target

qpcr |>
  dplyr::filter(target != "16S") |> 
  ggplot() +
  aes(
    x = (quant_adjusted * 250) |> log10(), 
    y = copies_per_swab |> log10(), 
    col =  target
    ) +
  geom_abline(intercept = 0, slope = 1, col = "gray") +
  geom_point(size = 0.5, alpha = 0.4) +
  facet_wrap(. ~ ext_lib_plate_nb , ncol = 6) 

For the 16S target, that number is 500.

Code
qpcr |>
  dplyr::filter(target == "16S") |> 
  mutate(
    r = copies_per_swab / (quant_adjusted )
  ) |> 
  pull(r) |> 
  hist()

Dilution factors

Dilution factors are defined per qPCR plate (and consequently, are the same for the 3 targets in the same VMRC group)

Code
qpcr |> 
  dplyr::count(target, dilution_factor) 

qpcr |> 
  group_by(pcr_plate_id, target) |>
  summarize(n_dilution_factors = dilution_factor |> unique() |> length()) 
Code
#|

qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    dilution_factor
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = dilution_factor |> factor()
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
    ) +
  xlab("Well column") + ylab("Well row") 

Exploratory & QC analyses

CQ and Copies per swabs per well

Code
qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    target,
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    cq
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Code
qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    target,
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    copies_per_swab
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = copies_per_swab |> asinh()
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Warning

From these visualizations of the raw data, something looks weird with the cq and copies_per_swab values for the first 3 plates of “VMRC group 1”.

Total number of copies per swab per sample type

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
  geom_boxplot(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ target_type, scales = "free", space = "free")

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(qpcr_sample_type ~ ., scales = "free") +
  guides(fill = "none", color = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  ) +
  facet_grid(. ~ target_type, scales = "free", space = "free")

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = ext_lib_plate_nb |> factor(), y = copies_per_swab, col = target, fill = target) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(qpcr_sample_type ~ ., scales = "free") +
  xlab("Extraction plate number") +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90, hjust = 0)
  )

g 

Code
g + scale_y_log10() 

Empty and water samples

From the plots above, it looks like the empty and water samples have non-zero values on a few LBP plates.

Code
plot_qpcr_empties <- function(qpcr, color_by = "qpcr_sample_type") {
  qpcr |> 
    dplyr::filter(qpcr_sample_type %in% c("Empty", "Water")) |> 
    group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
    mutate(replicate_nb = row_number()) |> 
    ungroup() |> 
    ggplot() +
    aes(x = sample, y = copies_per_swab, label = replicate_nb, col = !!sym(color_by)) +
    geom_text(size = 3) +
    facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      strip.text.y = element_text(angle = 0)
    ) 
}
Code
plot_qpcr_empties(qpcr, color_by = "qpcr_sample_type") 

Code
plot_qpcr_empties(qpcr |> mutate(zero_copies = (copies_per_swab == 0)), color_by = "zero_copies") 

Code
prob_target <-
  qpcr |> 
  select(target) |> 
  distinct() |> 
  mutate(prob_target = (target %in% c("C0022A1", "C0059E1", "C0175A1")))
Warning

We see the same issues as above for the first 3 plates of VMRC group 1.

Control samples

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("Control sample")) |> 
  group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(
    control_type = control_type |> str_replace_na("? Unknown (not in MG manifest)"),
    vibr_sample_id = vibr_sample_id |> fct_reorder(control_type),
    plate_nb = str_c("ext. plate\n", ext_lib_plate_nb) |> fct_reorder(ext_lib_plate_nb),
    ) |> 
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = control_type) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("Control type") +
  facet_grid(strain_group_qpcr + target ~ plate_nb, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    # strip.text.x = element_text(angle = 90),
    strip.text.y = element_text(angle = 0)
  ) 

Warning

I am surprise to see that almost all Mock 1 samples are positive (copies per swabs >>) because, these samples were supposed to be pooled samples from the CAP084 study. I don’t think that participants of that study received the LBP. It could also be that I made wrong assumptions when matching the vibr_sample_id with the metagenomics manifest data.

The “negative controls” look better (= more consistent with expectations), except, again, for the first 3 plates of VMRC group 1.

Standards

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% "Standard") |>
  ggplot() +
  aes(x = starting_quantity, y = cq, col = replicate_nb |> factor()) +
  geom_point(alpha = 0.5, size = 0.5) +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb) +
  scale_x_log10() +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

For the 16S, there seems to be “reverse saturation” (i.e., the cq is lower than expected with a linear relationship). Maybe the dilution was not that reliable at such low concentrations.

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% "Standard") |>
  mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |> 
  ggplot() +
  aes(x = starting_quantity, y = cq, col = dilution_factor |> factor()) +
  geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
  geom_text(alpha = 0.2, aes(label = std_nb)) +
  facet_wrap(strain_group_qpcr + target ~ ., ncol = 9) +
  scale_x_log10() +
  scale_color_manual(
    name = "Dilution factor",
    breaks = c(5,10,20,30),
    values = c("black", "deeppink3", "deeppink1", "pink")
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

It looks like standard 4 was not prepared or placed properly for VMRC group 4.

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% "Standard") |>
  mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |> 
  ggplot() +
  aes(x = starting_quantity, y = cq, col = ext_lib_plate_nb |> factor()) +
  geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
  geom_text(alpha = 0.2, aes(label = std_nb)) +
  facet_wrap(strain_group_qpcr + target ~ ., ncol = 9) +
  scale_x_log10() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

This shows again the issues with the first 3 plates of VMRC group 1.

Clinical samples

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(vibr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab, na.rm = TRUE),
    mean_cps = mean(copies_per_swab, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = vibr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

We notice again the same issues with the VRMC group 1 plate 1-3.

Warning

We could simply discard these values (exclude these targets x plates) or try to identify a new threshold to differentiate between background noise and actual signal.

Replicates

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(qpcr_sample_id, vibr_sample_id, target) |>
  mutate(
    n_replicates = n(),
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab, na.rm = TRUE),
    mean_cps = mean(copies_per_swab, na.rm = TRUE),
    min_ext_lib_plate_nb = min(ext_lib_plate_nb, na.rm = TRUE),
    ) |> 
  ungroup() |> 
  dplyr::filter(n_replicates > 3) |>
  mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = ext_lib_plate_nb |> factor(), y = copies_per_swab, label = replicate_nb, col = (ext_lib_plate_nb == 12)) +
  facet_grid(strain_group_qpcr + target ~ min_ext_lib_plate_nb + vibr_sample_id, scales = "free_x", space = "free_x") +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Extraction plate") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90, hjust = 1)
  )

Agreement between pairs of triplicates is good except for the samples that initially were on the first 3 plates for VMRC group 1.

New detection thresholds ?

The idea is to use the negative controls cq values to define plate- and target-specific “detection” thresholds, then use these thresholds to set to 0 the copies_per_swab values for which the cq values are above the threshold.

The figures below show the cq values for the VMRC group 1 and 2.

Code
qpcr |>
  dplyr::filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

We impute the missing cq values to 45 (well above the maximum observed value, and what I assume was the total number of cycles).

Code
qpcr <- qpcr |> mutate(cq_imp = cq |> replace_na(45)) 
Code
qpcr |>
  dplyr::filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq_imp
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Comparing the cq values per sample type

Code
qpcr <- 
  qpcr |>
  mutate(
    simplified_type = 
      case_when(
        str_detect(control_type, "Mock") ~ "Positive control",
        (qpcr_sample_type == "Control sample") & is.na(sample_type) ~ "Unknown control",
        qpcr_sample_type %in% c("Control sample", "Empty", "Water") ~ "Negative control",
        TRUE ~ qpcr_sample_type
      ) |> 
      fct_infreq(),
    qpcr_sample_type = qpcr_sample_type |> fct_infreq()
    ) 

# qpcr |> dplyr::count(simplified_type, sample_type, control_type, qpcr_sample_type)

We define the thresholds as the average over the two smallest cq value for the “empties” (i.e., empty and water samples, as well as the negative controls).

Code
thresholds <- 
  qpcr |> 
  dplyr::filter(target != "16S") |> 
  group_by(target, strain_group_qpcr, ext_lib_plate_nb) |> 
  summarize(
    min_cq_empties = min(cq_imp[simplified_type == "Negative control"], na.rm = TRUE),
    robust_min_cq_empties = cq_imp[simplified_type == "Negative control"] |> sort() |> extract(1:2) |> mean(),
    max_cq_std = max(cq_imp[simplified_type == "Standard"], na.rm = TRUE),
    .groups = "drop"
  ) |> 
  mutate(
    threshold_cq = robust_min_cq_empties
  )
Code
qpcr |> 
  dplyr::filter(
    strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), 
    ext_lib_plate_nb <= 6,
    simplified_type %in% c("VIBRANT clinical sample", "Standard", "Negative control") 
    ) |> 
  arrange(simplified_type) |> 
  ggplot() +
  aes(x = simplified_type |> str_wrap(width = 15), y = cq_imp, col = qpcr_sample_type) + #  
  geom_violin(col = "gray") +
  geom_jitter(height = 0, width = 0.25, alpha = 0.5, size = 0.75) +
  geom_hline(
    data = thresholds |> dplyr::filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), ext_lib_plate_nb <= 6), 
    aes(yintercept = threshold_cq), col = "steelblue1"
    ) +
  facet_grid(target ~ ext_lib_plate_nb) +
  scale_color_manual(values = c("steelblue1", "black", "dodgerblue3", "red","pink") |> rev()) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  xlab("Sample type") 

Code
qpcr <- 
  qpcr |> 
  select(-any_of(c("min_cq_empties","robust_min_cq_empties", "max_cq_std", "threshold_cq"))) |> 
  dplyr::left_join(thresholds, by = join_by(ext_lib_plate_nb, strain_group_qpcr, target)) 

For all plates and targets

Code
qpcr |>
  dplyr::filter(qpcr_sample_type == "VIBRANT clinical sample") |> 
  ggplot() +
  aes(x = cq_imp, y = target) +
  geom_violin(fill = "gray90", col = "transparent") +
  geom_jitter(width = 0, size = 0.1, col = "pink") +
  geom_vline(aes(xintercept = min_cq_empties), col = "dodgerblue") +
  geom_vline(aes(xintercept = robust_min_cq_empties), col = "green3") +
  geom_vline(aes(xintercept = max_cq_std), col = "red") +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free") +
  theme(strip.text.y = element_text(angle = 0)) +
  labs(
    x = "cq value (imputed to 45 when not avalaible)",
    caption = 
      str_c(
        "Pink dots are the cq values for the VIBRANT clinical samples\n",
        "The blue lines are the minimum cq value for the negative controls\n",
        "The green lines are the average of the two smallest cq values for the negative controls (= the detection threshold)\n",
        "The red lines are the maximum cq value for the standards"
      )
  )

Adjusted copies per swabs

We create a new column copies_per_swab_adj that is set to 0 if the cq_imp is above the threshold.

Code
qpcr <- 
  qpcr |> 
  mutate(
    copies_per_swab_adj = 
      case_when(
        (target != "16S") & (cq_imp >= threshold_cq) ~ 0,
        TRUE ~ copies_per_swab
      )
  )
Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(qpcr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab_adj, na.rm = TRUE),
    mean_cps = mean(copies_per_swab_adj, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = copies_per_swab_adj, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = qpcr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

This does not fully solve the problem, but I hope that at the “aggregation” step, taking the median value would remove a lot of false positive.

If not, a more stringent threshold could potentially be used, or these samples would have to be excluded from the analysis.

Aggregation across replicates

Code
qpcr <- 
  qpcr |> 
  group_by(qpcr_sample_id, target) |> 
  mutate(
    copies_per_swab_raw_median = median(copies_per_swab, na.rm = TRUE),
    copies_per_swab_adj_median = median(copies_per_swab_adj, na.rm = TRUE),
    copies_per_swab_adj_range = max(copies_per_swab_adj, na.rm = TRUE) - min(copies_per_swab_adj, na.rm = TRUE),
  ) |> 
  ungroup()
Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
  group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_raw_median > 0)) |> ungroup() |>
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_raw_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + LBP  ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("Median of the original `copies_per_swabs`")

Code
qpcr |> 
  dplyr::filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
  group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_adj_median > 0)) |> ungroup() |>
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_adj_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + LBP  ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("Median of the adjusted `copies_per_swabs`")

The threshold correction likely helped in reducing the number of false positive.

Longitudinal profiles

When looking at the longitudinal patterns for randomized participants, we do not notice an over-representation of a specific plate or group in the “isolated” positives:

Code
qpcr |> 
  dplyr::filter(
    sample_type %in% c("Clinical sample"), 
    # sample_category %in% c("Expected sample"),
    mg_visit_code %in% c(seq(1000, 1900, by = 100), 2120)
  ) |>
  group_by(mg_pid) |> mutate(n_target_detected = sum(copies_per_swab_adj_median[!is.na(LBP)] > 0)) |> ungroup() |>
  mutate(mg_pid = mg_pid |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(
    x = mg_pid, y = mg_visit_code, 
    fill = ext_lib_plate_nb |> factor(), 
    alpha = copies_per_swab_adj_median |> log10()
  ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ., scales = "free") +
  scale_fill_discrete("Extraction plate number") +
  scale_alpha_continuous(limits = c(0, 10)) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    strip.text.y = element_text(angle = 0),
    legend.position = "bottom"
  )

We should thus expect some false positive in the LBP qPCR data :)

Code
qpcr |> 
  dplyr::filter(
    target != "16S",
    sample_type %in% c("Clinical sample"), 
    mg_visit_code %in% c(0, seq(1000, 1900, by = 100), 2120)
  ) |>
  # group_by(mg_pid) |> mutate(n_target_detected = sum(copies_per_swab_adj_median[!is.na(LBP)] > 0)) |> ungroup() |>
  # mutate(mg_pid = mg_pid |> fct_reorder(n_target_detected)) |>
  mutate(mg_pid = mg_pid |> factor()) |>
  mutate(target = target |> fct_reorder(as.numeric(factor(LBP)) + as.numeric(factor(strain_origin))/10)) |> 
  ggplot() +
  aes(
    x = target, y = mg_pid |> fct_rev(), 
    fill = LBP,
    alpha = copies_per_swab_adj_median |> log10()
  ) +
  geom_tile() +
  facet_grid(. ~ mg_visit_code) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    # strip.text.y = element_text(angle = 0),
    legend.position = "bottom"
  )

There are clear patterns in the data, hinting at the participants’ arms :) However, this also suggests that there might be quite a lot of false positive and a low specificity for the qPCR probes.

Code
qpcr |> 
  dplyr::filter(
    sample_type %in% c("Clinical sample"), 
    # sample_category %in% c("Expected sample"),
    mg_visit_code %in% c(seq(1000, 1900, by = 100), 2120),
    target == "16S"
    ) |>  
  arrange(mg_pid, mg_visit_code) |> 
  select(mg_pid, mg_visit_code, copies_per_swab_adj_median) |>
  distinct() |> 
  ggplot() +
  aes(x = mg_visit_code, y = copies_per_swab_adj_median, col = mg_pid) +
  geom_boxplot(alpha = 0.2, fill = "dodgerblue1", color = "dodgerblue1") +
  geom_line(aes(group = mg_pid), alpha = 0.2) +
  # geom_point(size = 0.5) +
  ggbeeswarm::geom_quasirandom(width = 0.2) +
  scale_y_log10() +
  guides(col = "none") +
  ggtitle("16S copies per swab") 

Specificity and sensitivity estimation from Mock positive controls and from screening visits samples

In this section, we use the biological controls (Mock samples) to estimate the sensitivity (true positive rate) of the qPCR assays and the screening visit samples to estimate the specificity (false positive rate).

Our true positive samples are the Mock samples since they contain, among other strains, the 15 LBP strains.

The relative abundance of each LBP strains in Mock 1 samples should be around 1/100, while in Mock 2 samples, it should be around 1/24.

Unfortunately, we do not have a good biological negative sample (e.g., a pool of non-LBP strains), so we use the screening visit samples as our “true negative samples”, acknowledging that some of these samples might contain native L. crispatus that may be very closely related to one of the LBP strains.

Code
tmp <- 
  qpcr |> 
  dplyr::filter(
    (sample_type == "Positive control") | ((mg_visit_code <= 1000) & (sample_type == "Clinical sample"))
  ) |> 
  select(
    uid, mg_visit_code, sample_type, control_type, ext_lib_plate_nb, target, strain_group_qpcr, LBP, strain_origin, copies_per_swab_adj_median
  ) |> 
  arrange(uid) |> 
  distinct() |> 
  mutate(
    truth = 
      case_when(
        str_detect(control_type, "Mock") ~ "P",
        (mg_visit_code <= 1000) ~ "N",
        TRUE ~ NA_character_
      ),
    predicted = 
      case_when(
        copies_per_swab_adj_median > 0 ~ "PP",
        TRUE ~ "PN"
      )
  )
Code
tmp |> 
  ggplot() +
  aes(
    x = sample_type, y = copies_per_swab_adj_median |> asinh(), 
    col = str_c(sample_type, " ",control_type |> as.character(), " (", truth, ")")
    ) +
  facet_grid(. ~ strain_group_qpcr + target) +
  scale_color_discrete("Control type") + xlab("") +
  ggbeeswarm::geom_quasirandom(size = 0.7, alpha = 0.8) +
  theme(
    strip.text.x = element_text(angle = 90),
    axis.text.x = element_blank()
  )

Code
tmp <- 
  tmp |> 
  dplyr::filter(target != "16S") |> 
  dplyr::left_join(
    tmp |> 
      dplyr::filter(target == "16S") |> 
      select(uid, copies_per_swab_adj_median) |> 
      dplyr::rename(copies_per_swab_adj_median_16S = copies_per_swab_adj_median),
    by = join_by(uid)
  )
Code
tmp <- 
  tmp |> 
  mutate(
    exp_rel_ab = 
      case_when(
        (control_type == "Mock 1") ~ 1 / 100,
        (control_type == "Mock 2") ~ 1 / 24,
        TRUE ~ 0
      ),
    obs_rel_ab = copies_per_swab_adj_median / copies_per_swab_adj_median_16S
  )
Code
g <- 
  tmp |> 
  ggplot() +
  aes(
    x = str_c(LBP, strain_origin, target, sep = " - "), y = obs_rel_ab, 
    col = str_c(sample_type, " ", control_type, " (", truth, ")")
  ) +
  facet_grid(.  ~ sample_type + control_type) + # , scales = "free"
  geom_hline(aes(yintercept = exp_rel_ab), col = "gray50") +
  geom_boxplot(alpha = 0, outlier.shape = NA) +
  geom_point(alpha = 0.3) +
  scale_color_discrete("Control type") + 
  ylab("Observed relative abundance") + 
  xlab("LBP strain") + 
  labs(
    caption = "The horizontal lines are the expected relative abundance of each LBP strain in the control samples."
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

g

Code
g + ylim(c(0, 0.25)) 

Code
g + ylim(c(0, 0.05)) 

Code
tmp |> 
  group_by(target, strain_group_qpcr, LBP, strain_origin) |>
  summarize(
    `n True Negatives` = sum(truth == "N" & predicted == "PN"),
    `n Negatives` = sum(truth == "N"),
    specificity = `n True Negatives` / `n Negatives`,
    `n True Positives` = sum(truth == "P" & predicted == "PP"),
    `n Positives` = sum(truth == "P"),
    sensitivity = sum(truth == "P" & predicted == "PP") / sum(truth == "P"),
    .groups = "drop"
  ) |> 
  arrange(strain_group_qpcr, target) |> 
  gt(caption = "Specificity and sensitivity of the qPCR assays for the LBP targets") |>
  cols_label(
    specificity ~ "Specificity (%) [likely underestimated]",
    sensitivity ~ "Sensitivity (%)",
    strain_origin ~ "Strain origin",
    strain_group_qpcr ~ "qPCR group",
    target ~ "Target (LBP strain)"
  ) |>
  fmt_number(
    columns = c(specificity, sensitivity),
    decimals = 2,
    scale_by = 100
  ) 
Specificity and sensitivity of the qPCR assays for the LBP targets
Target (LBP strain) qPCR group LBP Strain origin n True Negatives n Negatives Specificity (%) [likely underestimated] n True Positives n Positives Sensitivity (%)
C0022A1 VMRC_1 LC-106 & LC-115 US 270 276 97.83 21 22 95.45
C0059E1 VMRC_1 LC-106 & LC-115 US 256 276 92.75 21 22 95.45
C0175A1 VMRC_1 LC-106 & LC-115 US 261 276 94.57 21 22 95.45
FF00018 VMRC_2 LC-106 & LC-115 SA 266 276 96.38 21 22 95.45
FF00051 VMRC_2 LC-106 & LC-115 SA 258 276 93.48 21 22 95.45
UC101 VMRC_2 LC-106 & LC-115 SA 264 276 95.65 21 22 95.45
185329 VMRC_3 LC-115 SA 276 276 100.00 15 22 68.18
C0028A1 VMRC_3 LC-115 US 274 276 99.28 13 22 59.09
C0112A1 VMRC_3 LC-115 US 274 276 99.28 21 22 95.45
122010 VMRC_4 LC-115 SA 264 276 95.65 21 22 95.45
C0006A1 VMRC_4 LC-115 US 271 276 98.19 21 22 95.45
FF00004 VMRC_4 LC-115 SA 273 276 98.91 21 22 95.45
FF00064 VMRC_5 LC-115 SA 271 276 98.19 21 22 95.45
FF00072 VMRC_5 LC-115 SA 275 276 99.64 21 22 95.45
UC119 VMRC_5 LC-115 SA 276 276 100.00 21 22 95.45
Code
tmp |> 
  dplyr::filter(sample_type == "Clinical sample") |> 
  mutate(
    site = case_when(
      str_detect(uid, "06810") ~ "MGH",
      str_detect(uid, "06820") ~ "CAP",
      TRUE ~ "Other"
    )
  ) |> 
  ggplot() +
  aes(
    x = site, y = copies_per_swab_adj_median |> asinh(), 
    col = site
    ) +
  facet_grid(. ~ strain_origin + target) +
  scale_color_manual("Study site", values = site_colors) + 
  ggbeeswarm::geom_quasirandom(size = 0.7, alpha = 0.8) +
  theme(
    strip.text.x = element_text(angle = 90),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

It looks like there might be a mild correlation between strain origin and study site for screening visit samples.

Creating SummarizedExperiment objects

We create two Summarized Experiment objects:

  • one with all values as provided in the original file, where each “sample” is a plate well and each feature is a target (LBP taxa)

  • one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including “biological” control samples) and each feature is a target (LBP taxa)

Raw SummarizedExperiment object

We create a SummarizedExperiment object with the following assays:

  • dilution_factor
  • cq
  • starting_quantity
  • cq_mean
  • quant_adjusted
  • copies_per_swab_raw (current copies_per_swab column)
  • copies_per_swab (current copies_per_swab_adj column)

The “samples” are defined by the qpcr_uid column (one per well and plate) and the “features” are defined by the target column (i.e., the LBP taxa).

The colData contains the following columns:

  • qpcr_uid (the qPCR specific unique identifier)
  • uid (the VIBRANT cross-assay unique sample identifier)
  • qpcr_sample_id
  • vibr_sample_id
  • qpcr_sample_type
  • sample_type
  • control_type
  • ext_lib_plate_nb
  • ext_lib_plate_id

The rowData contains the following columns:

  • fluor
  • strain_group_qpcr
  • pcr_plate_id (the concatenation of pcr_plate_id for each ext_lib_plate_nb)
  • taxon_label
  • LBP
  • strain_id
  • strain_origin
  • biose_id
Code
qpcr |> dplyr::count(qpcr_uid) |> dplyr::count(n)

qpcr |> dplyr::count(qpcr_uid) |> nrow()


qpcr |> dplyr::count(qpcr_uid) |> dplyr::filter(n != 16) |> View()

qpcr |> 
  select(
    qpcr_uid, 
    uid, mg_pid, mg_visit_code,
    qpcr_sample_type,  sample_type, control_type, 
    qpcr_sample_id, vibr_sample_id, replicate_nb, 
    ext_lib_plate_nb, ext_lib_plate_id
  ) |> 
  distinct() |> 
  group_by(qpcr_uid) |> mutate(n = n()) |> ungroup() |> 
  dplyr::filter(n > 1) |> 
  arrange(qpcr_uid)

qpcr |> 
  dplyr::filter(qpcr_sample_id == "1588") |> 
  select(qpcr_sample_id, qpcr_uid, pcr_plate_id, well_row, well_col, ext_lib_plate_nb, ext_lib_plate_id, target) |> 
  arrange(target, pcr_plate_id,  qpcr_uid, well_row, well_col) |> 
  View()
Code
make_raw_qpcr_SE <- function(qpcr){
  
  qpcr <- 
    qpcr |> 
    arrange(ext_lib_plate_nb, well_col, well_row) |>
    mutate(qpcr_uid = qpcr_uid |> fct_inorder()) |> 
    arrange(strain_group_qpcr, fluor) |> 
    mutate(target = target |> fct_inorder()) |> 
    arrange(qpcr_uid, target)
    
  # ASSAYS
  
  dilution_assay <- 
    qpcr |>
    select(qpcr_uid, target, dilution_factor) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = dilution_factor) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  cq_assay <- 
    qpcr |>
    select(qpcr_uid, target, cq) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = cq) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  starting_quantity_assay <- 
    qpcr |>
    select(qpcr_uid, target, starting_quantity) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = starting_quantity) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
   cq_mean_assay <- 
    qpcr |>
    select(qpcr_uid, target, cq_mean) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = cq_mean) |> 
    as.data.frame() |> 
    column_to_rownames("target")
   
   quant_adjusted_assay <- 
     qpcr |>
     select(qpcr_uid, target, quant_adjusted) |> 
     arrange() |> 
     pivot_wider(names_from = qpcr_uid, values_from = quant_adjusted) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
   copies_per_swab_raw_assay <- 
     qpcr |>
     select(qpcr_uid, target, copies_per_swab) |> 
     arrange() |> 
     pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
  copies_per_swab_assay <- 
    qpcr |>
    select(qpcr_uid, target, copies_per_swab_adj) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab_adj) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
    # COLDATA
   coldata <- 
     qpcr |> 
     select(
       qpcr_uid, 
       uid, mg_pid, mg_visit_code,
       qpcr_sample_type,  sample_type, control_type, 
       qpcr_sample_id, vibr_sample_id, replicate_nb, 
       ext_lib_plate_nb, ext_lib_plate_id
       ) |> 
     distinct() |> 
     arrange(qpcr_uid) |> 
     as.data.frame() |> 
     mutate(rownames = qpcr_uid) |> 
     column_to_rownames("rownames") 
  

    # ROWDATA
   rowdata <-
     qpcr |> 
     select(
       target, fluor, strain_group_qpcr, 
       pcr_plate_id, ext_lib_plate_nb, 
       taxon_label, LBP, strain_id, strain_origin, biose_id
       ) |> 
     distinct() |> 
     group_by(
       target, fluor, strain_group_qpcr, 
       taxon_label, LBP, strain_id, strain_origin, biose_id
       ) |> 
     arrange(target, ext_lib_plate_nb) |> 
     mutate(
       pcr_plate_id = str_c(pcr_plate_id, " (extr. plate ", ext_lib_plate_nb |> str_pad(width = 2, pad = "0"), ")", sep = ""),
     ) |> 
     summarize(
       pcr_plate_ids = pcr_plate_id |> unique() |> str_c(collapse = ", "),
       .groups = "drop"
     ) |> 
     ungroup() |> 
     as.data.frame() |> 
     mutate(rownames = target) |> 
     column_to_rownames("rownames")

  
   
   SE <- SummarizedExperiment(
     assays = list(
       dilution = dilution_assay |> as.matrix(),
       cq = cq_assay |> as.matrix(),
       cq_mean = cq_mean_assay |> as.matrix(),
       starting_quantity = starting_quantity_assay |> as.matrix(),
       quant_adjusted = quant_adjusted_assay |> as.matrix(),
       copies_per_swab_raw = copies_per_swab_raw_assay |>  as.matrix(),
       copies_per_swab = copies_per_swab_assay |> as.matrix()
     ),
     colData = coldata,
     rowData = rowdata,
     metadata = list(
       description = "Raw qPCR data from the VIBRANT study",
       date = today(),
       assay_and_coldata_dictionary = dictionary
     )
   )
   
   SE
  
}
Code
SE_qPCR_raw <- make_raw_qpcr_SE(qpcr)
SE_qPCR_raw
# A SummarizedExperiment-tibble abstraction: 61,536 × 30
# Features=16 | Samples=3846 | Assays=dilution, cq, cq_mean, starting_quantity,
#   quant_adjusted, copies_per_swab_raw, copies_per_swab
   .feature .sample      dilution    cq cq_mean starting_quantity quant_adjusted
   <chr>    <chr>           <dbl> <dbl>   <dbl>             <dbl>          <dbl>
 1 C0175A1  std_01_plat…       10  15.3    15.2          10000000      100000000
 2 C0022A1  std_01_plat…       10  16.0    16.0          10000000      100000000
 3 C0059E1  std_01_plat…       10  15.4    15.4          10000000      100000000
 4 UC101    std_01_plat…       10  13.9    13.9          10000000      100000000
 5 FF00018  std_01_plat…       10  14.6    14.6          10000000      100000000
 6 FF00051  std_01_plat…       10  14.5    14.6          10000000      100000000
 7 185329   std_01_plat…       30  14.0    13.9          10000000      300000000
 8 C0028A1  std_01_plat…       30  14.1    14.0          10000000      300000000
 9 C0112A1  std_01_plat…       30  14.6    14.6          10000000      300000000
10 FF00004  std_01_plat…       10  14.6    14.6          10000000      100000000
# ℹ 310 more rows
# ℹ 23 more variables: copies_per_swab_raw <dbl>, copies_per_swab <dbl>,
#   qpcr_uid <fct>, uid <chr>, mg_pid <chr>, mg_visit_code <chr>,
#   qpcr_sample_type <fct>, sample_type <chr>, control_type <chr>,
#   qpcr_sample_id <chr>, vibr_sample_id <chr>, replicate_nb <int>,
#   ext_lib_plate_nb <dbl>, ext_lib_plate_id <dbl>, target <fct>, fluor <chr>,
#   strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>, …

Aggregated SummarizedExperiment object

We create a SummarizedExperiment object with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays

  • dilution
  • copies_per_swab_med (the median copies_per_swab across replicates)
  • copies_per_swab_mean (the mean copies_per_swab across replicates)
  • copies_per_swab_cv (the coefficient of variation of copies_per_swab across replicates)

The colData contains the same columns as above (except for those that are replicate-specific)

The rowData contains the same columns as above,

Code
agg_qpcr_SE <- function(SE_qPCR_raw){
  
  filtered_qpcr <- 
    SE_qPCR_raw |> 
    dplyr::filter(
      !(qpcr_sample_type %in% c("Standard", "Water", "Empty")),
      !is.na(uid) 
      ) 
    
  summarized_qpcr <- 
    filtered_qpcr |>
    as_tibble() |> 
    group_by(uid, target) |> 
    summarize(
      n_non_na = sum(!is.na(copies_per_swab)),
      dilution = mean(dilution, na.rm = TRUE),
      copies_per_swab_med = median(copies_per_swab, na.rm = TRUE),
      copies_per_swab_mean = mean(copies_per_swab, na.rm = TRUE),
      copies_per_swab_sd = sd(copies_per_swab, na.rm = TRUE),
      copies_per_swab_range = max(copies_per_swab, na.rm = TRUE) - min(copies_per_swab, na.rm = TRUE),
      .groups = "drop"
    ) |> 
    mutate(
      copies_per_swab_cv = ifelse(copies_per_swab_sd == 0, 0, copies_per_swab_sd / copies_per_swab_mean)
    ) 

  # COLDATA
  coldata <- 
    filtered_qpcr |> 
    colData() |> 
    as.data.frame() |> 
    as_tibble() |> 
    select(-qpcr_uid, -replicate_nb, -starts_with("ext_lib")) |> 
    distinct() |>  
    arrange(uid)
  
  coldata <-  
    coldata |> 
    dplyr::left_join(
      filtered_qpcr |> 
        colData() |> 
        as.data.frame() |> 
        as_tibble() |> 
        select(uid, starts_with("ext_lib")) |> 
        group_by(uid) |> 
        summarize(
          ext_lib_plate_nb = str_c(ext_lib_plate_nb |> unique() |> sort(), collapse = ", "),
          ext_lib_plate_id = str_c(ext_lib_plate_id |> str_replace_na("unknown") |> unique() |> sort(), collapse = ", "),
          .groups = "drop"
        ),
      by = join_by(uid)
    )
  
   coldata <-  
    coldata |> 
    as.data.frame() |> 
    mutate(rownames = uid) |> 
    column_to_rownames("rownames")
  
  # ROWDATA
   rowdata <-
     filtered_qpcr |> 
     rowData() |> 
     as.data.frame()
    
   SE <- SummarizedExperiment(
     assays = list(
       n_non_na = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = n_non_na) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
        dilution = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = dilution) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_med = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_med) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_mean = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_mean) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_cv = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_cv) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix()
     ),
     colData = coldata,
     rowData = rowdata,
     metadata = list(
       description = "Aggregated qPCR data from the VIBRANT study",
       date = today(),
       assay_and_coldata_dictionary = SE_qPCR_raw@metadata$assay_and_coldata_dictionary
     )
   )
   
   SE
     
}
Code
SE_qPCR_agg <- agg_qpcr_SE(SE_qPCR_raw = SE_qPCR_raw)
SE_qPCR_agg
# A SummarizedExperiment-tibble abstraction: 16,464 × 26
# Features=16 | Samples=1029 | Assays=n_non_na, dilution, copies_per_swab_med,
#   copies_per_swab_mean, copies_per_swab_cv
   .feature .sample   n_non_na dilution copies_per_swab_med copies_per_swab_mean
   <chr>    <chr>        <int>    <dbl>               <dbl>                <dbl>
 1 C0175A1  06810000…        3        5                   0                    0
 2 C0022A1  06810000…        3        5                   0                    0
 3 C0059E1  06810000…        3        5                   0                    0
 4 UC101    06810000…        3        5                   0                    0
 5 FF00018  06810000…        3        5                   0                    0
 6 FF00051  06810000…        3        5                   0                    0
 7 185329   06810000…        3       20                   0                    0
 8 C0028A1  06810000…        3       20                   0                    0
 9 C0112A1  06810000…        3       20                   0                    0
10 FF00004  06810000…        3       20                   0                    0
# ℹ 310 more rows
# ℹ 20 more variables: copies_per_swab_cv <dbl>, uid <chr>, mg_pid <chr>,
#   mg_visit_code <chr>, qpcr_sample_type <fct>, sample_type <chr>,
#   control_type <chr>, qpcr_sample_id <chr>, vibr_sample_id <chr>,
#   ext_lib_plate_nb <chr>, ext_lib_plate_id <chr>, target <fct>, fluor <chr>,
#   strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>,
#   strain_origin <fct>, biose_id <chr>, pcr_plate_ids <chr>

Save SummarizedExperiment objects

We first check that the SE_qPCR_agg object is formatted as it should for its integration in the MAE.

Code
# We remove `mg_pid` and `mg_visit_code` from the colData to avoid conflict when merging with the metagenomics data.
colData(SE_qPCR_agg) <- colData(SE_qPCR_agg)[, -which(colnames(colData(SE_qPCR_agg)) %in% c("mg_pid", "mg_visit_code"))]
# We check that the `SE_qPCR_agg` object is formatted as it should
SE_qPCR_agg <- check_se(SE_qPCR_agg)

Save the SE objects to disk

Code
saveRDS(
  SE_qPCR_raw, 
  str_c(
    get_output_dir(data_source = data_source),  
    "03_se_pcr_raw_", today() |> str_remove_all("-"), ".rds"
    )
  )


saveRDS(
  SE_qPCR_agg, 
  str_c(
    get_output_dir(data_source = data_source),  
    "03_se_pcr_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )